{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "orig_nbformat": 2,
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "npconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": 3,
    "colab": {
      "name": "multi_training_demo.ipynb",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/Tomiinek/Multilingual_Text_to_Speech/blob/master/notebooks/multi_training_demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "F3AJf3TaPqJ9"
      },
      "source": [
        "# Multilingual Text-to-Speech Demo\n",
        "\n",
        "This notebook demonstrates multilingual text-to-speech (without voice cloning) using:\n",
        "\n",
        "- Tacotron-based spectrogram generation: https://github.com/Tomiinek/Multilingual_Text_to_Speech\n",
        "- WaveRNN vocoder: https://github.com/Tomiinek/WaveRNN, forked from fatchord/WaveRNN\n",
        "\n",
        "\n",
        "**Estimated time to complete**: 4 minutes\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "OYiFm0mCPYGl",
        "colab": {}
      },
      "source": [
        "import sys\n",
        "import os\n",
        "import IPython\n",
        "from IPython.display import Audio"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "KXpZDAbjP68U"
      },
      "source": [
        "## Clone repositories"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "Nc3RKiiUQD2I",
        "colab": {}
      },
      "source": [
        "# Work from the home directory so both repositories land side by side.\n",
        "os.chdir(os.path.expanduser(\"~\"))\n",
        "\n",
        "# Each repository is cloned into exactly the directory that the existence check\n",
        "# inspects, so the guard and the clone destination cannot drift apart when a\n",
        "# directory name is changed.\n",
        "tacotron_dir = \"Multilingual_Text_to_Speech\"\n",
        "if not os.path.exists(tacotron_dir):\n",
        "  ! git clone https://github.com/Tomiinek/Multilingual_Text_to_Speech $tacotron_dir\n",
        "\n",
        "wavernn_dir = \"WaveRNN\"\n",
        "if not os.path.exists(wavernn_dir):\n",
        "  ! git clone https://github.com/Tomiinek/WaveRNN $wavernn_dir"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "QgLULYkZQMdA"
      },
      "source": [
        "## Download pre-trained models"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "ynJ9gv1fQOWV",
        "colab": {}
      },
      "source": [
        "# Build the checkpoint directory from an absolute path so this cell does not\n",
        "# depend on the working directory left behind by whichever cell ran last.\n",
        "checkpoint_dir = os.path.join(os.path.expanduser(\"~\"), \"checkpoints\")\n",
        "os.makedirs(checkpoint_dir, exist_ok=True)\n",
        "os.chdir(checkpoint_dir)  # curl -O writes into the current directory\n",
        "\n",
        "# --fail makes curl exit with an error on an HTTP error status instead of\n",
        "# saving the error page under the checkpoint name; such a file would satisfy\n",
        "# the existence checks below and block any later retry.\n",
        "tacotron_chpt = \"generated_training.pyt\"\n",
        "if not os.path.exists(os.path.join(checkpoint_dir, tacotron_chpt)):\n",
        "  ! curl --fail -O -L \"https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/$tacotron_chpt\"\n",
        "\n",
        "wavernn_chpt = \"wavernn_weight.pyt\"\n",
        "if not os.path.exists(os.path.join(checkpoint_dir, wavernn_chpt)):\n",
        "  ! curl --fail -O -L \"https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/$wavernn_chpt\"\n",
        "\n",
        "# Return to the home directory, where the following cells expect to start.\n",
        "os.chdir(os.path.expanduser(\"~\"))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "e5XnBICcQj5K"
      },
      "source": [
        "## Install dependencies"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "i-ag6P26Qlcl",
        "colab": {}
      },
      "source": [
        "# %pip installs into the environment of the running kernel, which a bare\n",
        "# `! pip` shell call does not guarantee.\n",
        "%pip install -q -U soundfile\n",
        "%pip install -q -U phonemizer\n",
        "%pip install -q -U epitran"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "zBXJF72LQril"
      },
      "source": [
        "## Input texts to be synthesized"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "JRJzeUQjQscZ"
      },
      "source": [
        "Inputs consist of **two parts delimited** by `|`:\n",
        "  - **Input utterance** - Only basic normalization is applied to input utterances, so **you should avoid unusual characters and punctuation**. See the examples below, which are formatted properly.\n",
        "  - **Language or Speaker ID** - More language IDs are available, but **you should use only one of** `spanish` (M), `french` (M), `german` (F), `greek` (F), `dutch` (M), `finnish` (M), `hungarian` (F), `chinese` (F), `japanese` (M), and `russian` (M).\n",
        "\n",
        "Feel free to modify the examples below.\n",
        "\n",
        "\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "SKv_QZlEQsub",
        "colab": {}
      },
      "source": [
        "# Each entry has the form \"<utterance>|<language ID>\".\n",
        "inputs = [\n",
        "    \"Las hojas con los cantos afilados cortan los objetos que se sitúan entre ellas.|spanish\",\n",
        "    \"C'est l'un des plus beaux palais de Mistretta dont le nom dérive d'une ancienne famille seigneuriale de la ville.|french\",\n",
        "    \"Nach seiner Rückkehr wurde er Vorstand der Abteilung für Landesaufnahme im sächsischen Generalstab.|german\",\n",
        "    \"Αυτές οι κριτικές αναζητήσεις εμφανίζονται στην ελληνική φωτογραφία μόλις στα τέλη της δεκαετίας.|greek\",\n",
        "    \"Soms is dit inderdaad een papieren bonnetje, maar vaak ook een plastic muntje of een metalen munt.|dutch\",\n",
        "    \"Päälaivojen korkeus antoi tilaa klerestorioikkunoille sivulaivojen yläpuolelle.|finnish\",\n",
        "    \"Háborús felek között általánosan elismerten a megadás vagy a fegyverszünet jeleként ismert a legtöbbek számára.|hungarian\",\n",
        "    \"jìsuànjī dàxué zhǔyào xuékē shì kēxué hé jìzhúbù， xuéshēng kěyǐ huòqǔ jìsuànjīkēxué hé jìzhú de běnkē xuéwèi。|chinese\",\n",
        "    \"yokuasa、 saheiji ha riyuu wo tsuke te jibun ha mou ippaku suru mune wo nakama ni tsuge、 mina wo kaeshi te shimau。|japanese\",\n",
        "    \"Из города к церкви по склону холма ведут роскошно декорированная лестница в стиле необарокко.|russian\",\n",
        "]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "03v8NQRiQwyo"
      },
      "source": [
        "## Synthesis"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "_Y6xDGJzQ7qQ"
      },
      "source": [
        "### Spectrogram generation"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "PXPPrkHlQ-79",
        "colab": {}
      },
      "source": [
        "# Import from inside the Tacotron repository checkout.\n",
        "os.chdir(os.path.join(os.path.expanduser(\"~\"), tacotron_dir))\n",
        "\n",
        "# This cell and the waveform cell each import a top-level `utils` from their own\n",
        "# repository. Drop the cached package together with every cached `utils.*`\n",
        "# submodule; removing only `utils` would leave submodules of the other\n",
        "# repository in the cache, where they could be reused by name.\n",
        "for cached in [m for m in sys.modules if m == \"utils\" or m.startswith(\"utils.\")]:\n",
        "  del sys.modules[cached]\n",
        "\n",
        "from synthesize import synthesize\n",
        "from utils import build_model\n",
        "\n",
        "model = build_model(os.path.join(os.path.expanduser(\"~\"), \"checkpoints\", tacotron_chpt))\n",
        "model.eval()\n",
        "\n",
        "# Each input is \"<utterance>|<language ID>\". The ID is passed twice after the\n",
        "# utterance and the leading field is left empty.\n",
        "# NOTE(review): presumably the two trailing fields are speaker and language;\n",
        "# confirm against `synthesize`.\n",
        "spectrograms = []\n",
        "for line in inputs:\n",
        "  tokens = line.split(\"|\")\n",
        "  s = synthesize(model, \"|\" + tokens[0] + \"|\" + tokens[1] + \"|\" + tokens[1])\n",
        "  spectrograms.append(s)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "VbvnE-olQ_UG"
      },
      "source": [
        "### Waveform generation"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "TCrfKz1zQ_sQ",
        "colab": {}
      },
      "source": [
        "# Import from inside the WaveRNN repository checkout.\n",
        "os.chdir(os.path.join(os.path.expanduser(\"~\"), wavernn_dir))\n",
        "\n",
        "# The spectrogram cell imported a different top-level `utils`. Drop the cached\n",
        "# package together with every cached `utils.*` submodule; removing only `utils`\n",
        "# would leave the other repository's submodules in the cache, where they could\n",
        "# be reused by name.\n",
        "for cached in [m for m in sys.modules if m == \"utils\" or m.startswith(\"utils.\")]:\n",
        "  del sys.modules[cached]\n",
        "\n",
        "from models.fatchord_version import WaveRNN\n",
        "from utils import hparams as hp\n",
        "from gen_wavernn import generate\n",
        "import torch\n",
        "\n",
        "hp.configure('hparams.py')\n",
        "\n",
        "# Use the GPU when one is available, otherwise fall back to the CPU.\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "model = WaveRNN(\n",
        "    rnn_dims=hp.voc_rnn_dims,\n",
        "    fc_dims=hp.voc_fc_dims,\n",
        "    bits=hp.bits,\n",
        "    pad=hp.voc_pad,\n",
        "    upsample_factors=hp.voc_upsample_factors,\n",
        "    feat_dims=hp.num_mels,\n",
        "    compute_dims=hp.voc_compute_dims,\n",
        "    res_out_dims=hp.voc_res_out_dims,\n",
        "    res_blocks=hp.voc_res_blocks,\n",
        "    hop_length=hp.hop_length,\n",
        "    sample_rate=hp.sample_rate,\n",
        "    mode=hp.voc_mode,\n",
        ").to(device)\n",
        "model.load(os.path.join(os.path.expanduser(\"~\"), \"checkpoints\", wavernn_chpt))\n",
        "\n",
        "# One waveform per spectrogram, in the same order as `inputs`.\n",
        "waveforms = [generate(model, s, hp.voc_gen_batched, hp.voc_target, hp.voc_overlap) for s in spectrograms]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "fA_YgxG4RAFO"
      },
      "source": [
        "## Resulting audio"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "kkc2lPz8RAjv",
        "colab": {}
      },
      "source": [
        "# Print every input line, followed by an audio player for its waveform.\n",
        "for position in range(len(waveforms)):\n",
        "  print(inputs[position])\n",
        "  player = IPython.display.Audio(data=waveforms[position], rate=hp.sample_rate)\n",
        "  IPython.display.display(player)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}